# Dataset settings for the task: one stanza per data split.
# Both stanzas list their keys in the same order so they can be compared
# line by line.
task:
  # Training split.
  train_data:
    # Left empty in this file; presumably supplied by an override at launch
    # time -- TODO confirm with the launcher.
    input_path: ''
    is_training: true
    seq_length: 2048
    global_batch_size: 64
    shuffle_buffer_size: 5000
    drop_remainder: true
  # Validation split: same seq_length as training, smaller global batch,
  # and no shuffle buffer configured.
  validation_data:
    # Left empty in this file; presumably supplied by an override at launch
    # time -- TODO confirm with the launcher.
    input_path: ''
    is_training: false
    seq_length: 2048
    global_batch_size: 4
    drop_remainder: true
# Training-loop settings: run length, reporting/checkpoint cadence, and the
# optimizer / learning-rate schedule.
trainer:
  # The interval values below are presumably counted in training steps --
  # TODO confirm against the trainer implementation.
  checkpoint_interval: 100
  max_to_keep: 20
  summary_interval: 10
  steps_per_loop: 10
  train_steps: 2000
  validation_interval: 10
  optimizer_config:
    optimizer:
      type: 'adamw'
    learning_rate:
      type: 'polynomial'
      polynomial:
        initial_learning_rate: 0.001
        end_learning_rate: 0.00005
        # Same value as train_steps above.
        decay_steps: 2000
    warmup:
      # Explicit selector, mirroring the `type` keys on `optimizer` and
      # `learning_rate`. NOTE(review): assumes the consumer picks the active
      # sub-config via `type`; without it the `polynomial` warmup settings
      # may be silently ignored -- confirm against the config loader.
      type: 'polynomial'
      polynomial:
        warmup_steps: 100

